import numpy as np
from math import *
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import sys, random, time
from env import single_expert_dynamics,single_expert_stochastic_dynamics, expert1_reward, expert2_reward, expert3_reward, expert1_cost, expert2_cost, expert3_cost, feature1, feature2, feature3, expert_1_basis_constraint, expert_2_basis_constraint, expert_3_basis_constraint
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

iterations=121
num_trials=100

def reward_cost_list(trajectories,num_data,steps=30):
  """Return the cumulative reward and cost of each stored trajectory.

  Trajectory ``i`` occupies rows ``steps*i`` to ``steps*(i+1)-1`` of
  ``trajectories``.  In every row, columns 0:2, 2:4 and 4:6 are passed as the
  states of experts 1, 2 and 3, and columns 6, 7 and 8 as their actions.
  The reward weights (omega) and cost weights (theta) are fixed inside this
  function.

  Args:
    trajectories: 2-D array with at least ``steps*num_data`` rows and at
      least 9 columns.
    num_data: number of trajectories to evaluate.
    steps: number of rows (time steps) per trajectory; defaults to 30.

  Returns:
    ``(reward_list, cost_list)``: two lists of length ``num_data``.  Entry
    ``i`` is the sum over all steps of trajectory ``i`` of the three expert
    rewards (respectively costs).  The element type is whatever the
    ``expert*_reward`` / ``expert*_cost`` helpers return.
  """
  # np.asmatrix is used instead of np.mat: np.mat was only an alias for it
  # and was removed from the main namespace in NumPy 2.0.
  omega1=np.asmatrix([1.0,-1.0]).T
  omega2=np.asmatrix([1.0,-1.0]).T
  omega3=np.asmatrix([1.0,-1.0]).T
  theta1=np.asmatrix([1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0]).T
  theta2=np.asmatrix([1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0]).T
  theta3=np.asmatrix([1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0]).T
  reward_list=[]
  cost_list=[]
  for i in range(num_data):
    reward=0.0
    cost=0.0
    single_trajectory=trajectories[steps*i:steps*(i+1),:]
    for j in range(steps):
      row=single_trajectory[j]
      # Copy each slice so the column vectors handed to the helpers do not
      # alias the trajectory buffer.
      state1=np.asmatrix(np.copy(row[0:2])).T
      state2=np.asmatrix(np.copy(row[2:4])).T
      state3=np.asmatrix(np.copy(row[4:6])).T
      action1=np.asmatrix(np.copy(row[6])).T
      action2=np.asmatrix(np.copy(row[7])).T
      action3=np.asmatrix(np.copy(row[8])).T
      single_reward=expert1_reward(omega1,state1,action1)+expert2_reward(omega2,state2,action2)+expert3_reward(omega3,state3,action3)
      single_cost=expert1_cost(theta1,state1,action1)+expert2_cost(theta2,state2,action2)+expert3_cost(theta3,state3,action3)
      reward=reward+single_reward
      cost=cost+single_cost
    reward_list.append(reward)
    cost_list.append(cost)
  return reward_list, cost_list

def nominal_reward_cost_list(trajectories,num_data,steps=30):
  """Return the thresholded cumulative reward of each stored trajectory.

  Despite its name, this function returns only a reward list; no cost is
  computed.  Trajectory ``i`` occupies rows ``steps*i`` to ``steps*(i+1)-1``
  of ``trajectories``.  In every row, columns 0:2, 2:4 and 4:6 are passed as
  the states of experts 1, 2 and 3, and columns 6, 7 and 8 as their actions.

  An expert's reward for a step is added only while the first component of
  that expert's state is below a fixed threshold: 4 for expert 1, 3 for
  expert 2 and 4 for expert 3.

  Args:
    trajectories: 2-D array with at least ``steps*num_data`` rows and at
      least 9 columns.
    num_data: number of trajectories to evaluate.
    steps: number of rows (time steps) per trajectory; defaults to 30.

  Returns:
    A list of length ``num_data`` holding the accumulated reward of each
    trajectory.
  """
  # np.asmatrix is used instead of np.mat: np.mat was only an alias for it
  # and was removed from the main namespace in NumPy 2.0.
  omega1=np.asmatrix([1.0,-1.0]).T
  omega2=np.asmatrix([1.0,-1.0]).T
  omega3=np.asmatrix([1.0,-1.0]).T
  reward_list=[]
  for i in range(num_data):
    reward=0.0
    single_trajectory=trajectories[steps*i:steps*(i+1),:]
    for j in range(steps):
      row=single_trajectory[j]
      state1=np.asmatrix(np.copy(row[0:2])).T
      state2=np.asmatrix(np.copy(row[2:4])).T
      state3=np.asmatrix(np.copy(row[4:6])).T
      action1=np.asmatrix(np.copy(row[6])).T
      action2=np.asmatrix(np.copy(row[7])).T
      action3=np.asmatrix(np.copy(row[8])).T
      # Each expert contributes only while its first state component is
      # below its threshold.
      if state1.item(0)<4:
        reward=reward+expert1_reward(omega1,state1,action1)
      if state2.item(0)<3:
        reward=reward+expert2_reward(omega2,state2,action2)
      if state3.item(0)<4:
        reward=reward+expert3_reward(omega3,state3,action3)
    reward_list.append(reward)
  return reward_list
# Nominal baseline: load the stored nominal trajectories (30 rows per trial,
# 9 columns), score them, and print the mean and standard deviation of the
# per-trial reward.
a = np.loadtxt("nominal_optimal_trajectory_file.txt", dtype=float)
nominal_trajectories = np.reshape(a, (30 * num_trials, 9))
nominal_reward_list = nominal_reward_cost_list(nominal_trajectories, num_trials)
nominal_reward_sd = sqrt(np.var(nominal_reward_list))
nominal_reward_mean = sum(nominal_reward_list) / len(nominal_reward_list)
print(nominal_reward_mean, nominal_reward_sd, sep="\n")
# Repeat both statistics once per iteration so they can be drawn as flat
# reference series against the iteration axis.
nominal_reward_mean = nominal_reward_mean * np.ones(iterations)
nominal_reward_sd = nominal_reward_sd * np.ones(iterations)

# Precomputed per-iteration reward means for the four learners and the
# centralized learner, read from text files in the working directory.
(
  reward1_mean,
  reward2_mean,
  reward3_mean,
  reward4_mean,
  centralized_reward_mean,
) = (
  np.array(np.loadtxt(mean_path, dtype=float))
  for mean_path in (
    "reward1_mean_file.txt",
    "reward2_mean_file.txt",
    "reward3_mean_file.txt",
    "reward4_mean_file.txt",
    "centralized_reward_mean_file.txt",
  )
)

# Matching per-iteration standard deviations.
(
  reward1_sd,
  reward2_sd,
  reward3_sd,
  reward4_sd,
  centralized_reward_sd,
) = (
  np.array(np.loadtxt(sd_path, dtype=float))
  for sd_path in (
    "reward1_sd_file.txt",
    "reward2_sd_file.txt",
    "reward3_sd_file.txt",
    "reward4_sd_file.txt",
    "centralized_reward_sd_file.txt",
  )
)

# Per-iteration reward distances.
# NOTE(review): the centralized series is read from "reward_distance_file.txt",
# which lacks the "centralized_" prefix used by the mean/sd files -- confirm
# this is the intended file.
(
  reward1_distance,
  reward2_distance,
  reward3_distance,
  reward4_distance,
  centralized_reward_distance,
) = (
  np.array(np.loadtxt(distance_path, dtype=float))
  for distance_path in (
    "reward1_distance_file.txt",
    "reward2_distance_file.txt",
    "reward3_distance_file.txt",
    "reward4_distance_file.txt",
    "reward_distance_file.txt",
  )
)

# Expert baseline: load the stored expert trajectories (30 rows per trial,
# 9 columns) and compute the mean and standard deviation of the per-trial
# reward and cost.
a = np.loadtxt("optimal_trajectory_file.txt", dtype=float)
trajectories = np.reshape(a, (30 * num_trials, 9))
expert_reward_list, expert_cost_list = reward_cost_list(trajectories, num_trials)
expert_cost_mean = sum(expert_cost_list) / len(expert_cost_list)
expert_cost_sd = sqrt(np.var(expert_cost_list))
# The reward statistics are repeated once per iteration so they can be drawn
# as flat reference series against the iteration axis.
expert_reward_mean = (sum(expert_reward_list) / len(expert_reward_list)) * np.ones(iterations)
expert_reward_sd = sqrt(np.var(expert_reward_list)) * np.ones(iterations)

# Figure 1: reward mean +/- one standard deviation per iteration for every
# series, with an inset showing the first 21 iterations of the four learners.
# No title or legend is drawn.
plt.rcParams.update({'font.size': 14})
axis = np.arange(0, iterations)
subaxis = np.arange(0, 21)
fig, ax = plt.subplots()

# Each entry: (mean, sd, line format, extra plot options).  The order is the
# drawing order and is kept fixed so colours are assigned as before.
for series_mean, series_sd, line_format, line_options in (
  (reward1_mean, reward1_sd, '-', {}),
  (reward2_mean, reward2_sd, '--', {}),
  (reward3_mean, reward3_sd, '-.', {}),
  (reward4_mean, reward4_sd, ':', {}),
  (centralized_reward_mean, centralized_reward_sd, '.', {'alpha': 0.5}),
  (expert_reward_mean, expert_reward_sd, '^', {'alpha': 1.0, 'markersize': 3}),
  (nominal_reward_mean, nominal_reward_sd, 'v', {'alpha': 1.0, 'markersize': 3}),
):
  ax.plot(axis, series_mean, line_format, **line_options)
  ax.fill_between(axis, series_mean - series_sd, series_mean + series_sd, alpha=0.2)

plt.xticks(np.arange(0, iterations, 40))
plt.xlabel('Inner iterations')

# Inset sized at 20% x 20% of the bbox_to_anchor box, placed with loc=5.
axins = inset_axes(
  ax,
  width="20%",
  height="20%",
  bbox_to_anchor=(-300, -60, 650, 450),
  loc=5,
)

# Only the four learner series appear in the inset, restricted to the first
# 21 iterations and with a lighter band.
for series_mean, series_sd, line_format in (
  (reward1_mean, reward1_sd, '-'),
  (reward2_mean, reward2_sd, '--'),
  (reward3_mean, reward3_sd, '-.'),
  (reward4_mean, reward4_sd, ':'),
):
  head_mean = series_mean[0:21]
  head_sd = series_sd[0:21]
  axins.plot(subaxis, head_mean, line_format)
  axins.fill_between(subaxis, head_mean - head_sd, head_mean + head_sd, alpha=0.15)

# plt.xticks acts on pyplot's current axes -- presumably the inset created
# just above; verify if the tick placement looks wrong.
plt.xticks(np.arange(0, 21, 10))
mark_inset(ax, axins, loc1=2, loc2=4, ec="0.5")
plt.savefig('cumulative_reward.pdf')
plt.show()

# Figure 2: reward distance per iteration for the four learners and the
# centralized learner.  No bands, title or legend are drawn.
fig, ax = plt.subplots()

# Each entry: (series, line format, extra plot options), in drawing order.
for distance_series, line_format, line_options in (
  (reward1_distance, '-', {}),
  (reward2_distance, '--', {}),
  (reward3_distance, '-.', {}),
  (reward4_distance, ':', {}),
  (centralized_reward_distance, '.', {'alpha': 0.5}),
):
  ax.plot(axis, distance_series, line_format, **line_options)

plt.xticks(np.arange(0, iterations, 40))
plt.xlabel('Inner iterations')

plt.savefig('reward_distance.pdf')
plt.show()



